##### ####################################################
#####                                               ######
#####                 Topic prep                    ######
#####                                               ######
##### ####################################################

# init ------------------------------------------------------------

# Deliberately no `rm(list = ls())` here: it would delete the caller's
# workspace when this script is sourced, yet it does not give a clean session
# (attached packages and options survive). Run the script in a fresh R
# session instead. Nothing below reads a variable before assigning it.

# Fix the RNG state so that any random draws made by this script are
# reproducible.
set.seed(221186)

# Load libraries

library(data.table) # 1.11.4
library(quanteda) # 1.3.4
library(stm) # 1.3.3

# Load data

# Restores the saved objects into the current environment; the code below
# uses `speeches` from it.
# NOTE(review): `speeches` is used with data.table syntax below, so it is
# assumed to have been saved as a data.table -- confirm.
load("data/speeches.Rdata")

# Keep only the rows flagged by `minister_in_debate`.
# NOTE(review): assumes `minister_in_debate` is a logical column -- confirm.
speeches <- speeches[speeches$minister_in_debate]

### Collapse ministerial speeches within debate

# One document per `subsection_id`. Speech bodies are joined with a single
# space: joining with "" fuses the last word of one speech with the first
# word of the next, which produces spurious tokens once punctuation is
# stripped during text processing.
texts <- speeches[, list(text = paste(body, collapse = " "),
                         ministry = unique(debate_department)),
                  by = subsection_id]

# `unique()` yields one row per distinct department, so a debate with more
# than one department would appear as several identical documents. Flag it
# instead of silently inflating the corpus.
if (anyDuplicated(texts$subsection_id) > 0L) {
  warning("Some subsection_id values map to more than one ",
          "debate_department; their texts are duplicated in `texts`.",
          call. = FALSE)
}

### Process texts for STM

# Build the stm input from the collapsed texts, using the preprocessing
# switches set explicitly below (stop words, punctuation, numbers, stemming,
# sparsity threshold).
# The metadata is every column of `texts` except the raw text, selected by
# name so that a change in column order cannot drop the wrong column.
out <- textProcessor(
  texts$text,
  metadata = as.data.frame(texts)[, setdiff(names(texts), "text"),
                                  drop = FALSE],
  removestopwords = TRUE,
  removepunctuation = TRUE,
  removenumbers = TRUE,
  stem = TRUE,
  sparselevel = .995,
  language = "en",
  verbose = TRUE
)

# Final preparation step on the documents, vocabulary and metadata returned
# by textProcessor().
prep <- prepDocuments(out$documents, out$vocab, out$meta)

# Make sure the output directory exists so that save() cannot fail on a
# fresh checkout.
dir.create("working/topics", recursive = TRUE, showWarnings = FALSE)

save(prep, file = "working/topics/prep.Rdata")

### Loop over topic counts

# Candidate numbers of topics: 20, 25, ..., 90.
number.of.categories <- seq(20, 90, 5)

# Create the output directory up front: each fit can take a long time, and a
# missing directory would otherwise only surface at save() time, after the
# model has been estimated, losing the result.
model_dir <- "working/topics/mod.out"
dir.create(model_dir, recursive = TRUE, showWarnings = FALSE)

# Fit one model per topic count and store each in its own file. The loop
# variable is named `k` (not `t`) to avoid shadowing base::t().
for (k in number.of.categories) {

  cat(paste0("Estimating a model with ", k, " topics.\n"))

  # Same seed for every K so that runs are reproducible.
  mod.out <- stm(prep$documents, prep$vocab, K = k, max.em.its = 600,
                 seed = 221186, verbose = FALSE)

  # NOTE(review): `mod.out$time` is taken to be elapsed seconds -- confirm
  # against the stm documentation.
  cat(paste0(k, " topics took ", round(mod.out$time / 60), " minutes.\n"))

  # File name is unchanged: working/topics/mod.out/all_<K>.Rdata
  save(mod.out, file = file.path(model_dir, paste0("all_", k, ".Rdata")))

}